# sys, file and nav packages:
import datetime as dt

# math packages:
import pandas as pd
import numpy as np
from scipy import stats
from statsmodels.distributions.empirical_distribution import ECDF

# charting:
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from matplotlib import ticker
from matplotlib import colors
import seaborn as sns
import matplotlib.gridspec as gridspec

# home brew utitilties
import resources.utility_functions as ut
import resources.abundance_classes as ac
import resources.chart_kwargs as ck

import resources.sr_ut as sut

# images and display
import base64, io, IPython
from PIL import Image as PILImage
from IPython.display import Markdown as md
from IPython.display import display, Math, Latex
import matplotlib.image as mpimg


# set some parameters:
today = dt.datetime.now().date().strftime("%Y-%m-%d")
start_date = '2020-03-01'
end_date ='2021-05-31'

sns.set_style('whitegrid')

a_fail_rate = 50

unit_label = 'p/100m'
reporting_unit = 100

# name of the output folder:
name_of_project = 'trans_report_all'

a_color = 'dodgerblue'

# set the maps
bassin_map = PILImage.open("resources/maps/survey_locations_all.jpeg")
bassin_pallette = {'rhone':'dimgray', 'aare':'salmon', 'linth':'tan', 'ticino':'steelblue', 'reuss':'purple'}

# define the feature level and components
comps = ['linth', 'rhone', 'aare', 'ticino']
comp_labels = {"linth":"Linth/Limmat", "rhone":"Rhône", 'aare':"Aare", "ticino":"Ticino/Cerisio", "reuss":"Reuss"}
comp_palette = {"Linth/Limmat":"dimgray", "Rhône":"tan", "Aare":"salmon", "Ticino/Cerisio":"steelblue", "Reuss":"purple"}

# explanatory variables:
luse_exp = ['% to buildings', '% to recreation', '% to agg', '% to woods', 'streets km', 'intersects']

# columns needed
use_these_cols = ['loc_date' ,
                  '% to buildings',
                  '% to trans', 
                  '% to recreation',
                  '% to agg',
                  '% to woods',
                  'population',
                  'river_bassin',
                  'water_name_slug',
                  'streets km',
                  'intersects',
                  'length',
                  'groupname',
                  'code'
                 ]

# these are default
top_name = ["All survey areas"]

# add the folder to the directory tree:
project_directory = ut.make_project_folder('output', name_of_project)

# get your data:
survey_data = pd.read_csv('resources/results_with_land_use_2015.csv')
river_bassins = ut.json_file_get("resources/river_basins.json")
dfBeaches = pd.read_csv("resources/beaches_with_land_use_rates.csv")
dfCodes = pd.read_csv("resources/codes_with_group_names_2015.csv")
dfDims = pd.read_csv("resources/dims_data.csv")

# set the index of the beach data to location slug
dfBeaches.set_index('slug', inplace=True)

city_map = dfBeaches['city']

# map locations to feature names
location_wname_key = dfBeaches.water_name_slug

# map water_name_slug to water_name
wname_wname = dfBeaches[['water_name_slug','water_name']].reset_index(drop=True).drop_duplicates()
wname_wname.set_index('water_name_slug', inplace=True)
        
def make_plot_with_spearmans(data, ax, n):
    """Gets Spearmans ranked correlation and make A/B scatter plot. Must proived a
    matplotlib axis object.
    """    
    sns.scatterplot(data=data, x=n, y=unit_label, ax=ax, color='black', s=30, edgecolor='white', alpha=0.6)
    corr, a_p = stats.spearmanr(data[n], data[unit_label])
    
    return ax, corr, a_p

dfCodes.set_index("code", inplace=True)

# these descriptions need to be shortened for display
dfCodes = sut.shorten_the_value(["G74", "description", "Insulation: includes spray foams"], dfCodes)
dfCodes = sut.shorten_the_value(["G940", "description", "Foamed EVA for crafts and sports"], dfCodes)
dfCodes = sut.shorten_the_value(["G96", "description", "Sanitary-pads/tampons, applicators"], dfCodes)
dfCodes = sut.shorten_the_value(["G178", "description", "Metal bottle caps and lids"], dfCodes)
dfCodes = sut.shorten_the_value(["G82", "description", "Expanded foams 2.5cm - 50cm"], dfCodes)
dfCodes = sut.shorten_the_value(["G81", "description", "Expanded foams .5cm - 2.5cm"], dfCodes)
dfCodes = sut.shorten_the_value(["G117", "description", "Expanded foams < 5mm"], dfCodes)
dfCodes = sut.shorten_the_value(["G75", "description", "Plastic/foamed polystyrene 0 - 2.5cm"], dfCodes)
dfCodes = sut.shorten_the_value(["G76", "description", "Plastic/foamed polystyrene 2.5cm - 50cm"], dfCodes)
dfCodes = sut.shorten_the_value(["G24", "description", "Plastic lid rings"], dfCodes)
dfCodes = sut.shorten_the_value(["G33", "description", "Lids for togo drinks plastic"], dfCodes)
dfCodes = sut.shorten_the_value(["G3", "description", "Plastic bags, carier bags"], dfCodes)
dfCodes = sut.shorten_the_value(["G204", "description", "Bricks, pipes not plastic"], dfCodes)

# make a map to the code descriptions
code_description_map = dfCodes.description

# make a map to the code descriptions
code_material_map = dfCodes.material

6. Shared responsibility

Identification, quantification and analysis of anthropogenic debris in Swiss waters (IQAASL) is a project funded by the Swiss federal office of the environment (FOEN) that was conducted from January 2020 - December 2021. Over 300 beach-litter surveys were conducted in four catchment areas following the Guide for Monitoring Marine Litter (The Guide) [Han13].

Research on litter transport and accumulation in the aquatic environment has come to the conclusion that rivers are primary sources of land based macro-plastics to the marine environment [GFCH+21]. However not all objects that are transported by rivers make it to ocean, suggesting that rivers and inland lakes are also sinks for a portion of the macro-plastics that are emitted. [KBK+18]

In Switzerland this places the front line of reducing litter in the water directly on the municipal and cantonal administrations [fdlCs20]. Litter is a serious discussion in Switzerland. The first national litter survey was conducted from 2017 - 2018 [Bla18], the Swiss Litter Report used the methods defined by The Guide. Over 1000 surveys were completed in twelve months covering much of the national territory.

There are provisions in the law to account for unlawful disposal, commonly known as the principle of polluter payer. According to art. 2 of the Federal Act on the Protection of the Environment (LPE) the principal of causality is defined as :

The person who initiates a measure prescribed by this Act shall bear the costs thereof. [fdlCs20]

In Switzerland the elimination of waste and the management of litter is covered under chapter four of the LPE and the Ordinance on the limitation and disposal of waste (OLED) [fs20]. Article 31b of the LPE places the responsibility for the removal of litter on the cantons:

Municipal waste, waste from roads and public sewage treatment plants, as well as waste whose holder cannot be identified (litter) or is insolvent, are disposed of by the cantons.

In reference to the LPE and OLED the federal court ruling of 2012 for the city of Bern concluded that:

In the case of urban waste in the public space, the perpetrator often cannot be identified and that, consequently, those responsible further up the chain of causation may be required to contribute to the financing. [fdlenvironnement18]

Cantons and municipalities maintain the responsibility of the elimination and management of litter because legally they are owners of the land within their boundaries. However, this ruling allows for a more realistic application of the principle of causality.

f the person responsible for the release of municipal waste into the public space cannot be identified (littering or public garbage cans), it is permissible to consider companies or persons further up the chain of causality as producers of waste and to charge disposal fees to them (e.g. fast-food companies and similar businesses, or organizers of events that generate large quantities of waste on the public space) insofar as objectively founded criteria allow it. [fc12]

The Federal courts ruling and the current law provide a common sense legal basis to attribute the responsibility for litter if the specific offender cannot be identified. The responsibility may be attributed to another person or enterprise insofar as objectively founded criteria allow it

6.1. The Challenge

The challenge is to meet the requirements of objective criteria. The method to meet the requirements needs to be robust, transparent and easily repeatable.

output = io.BytesIO()

this_image = PILImage.open("resources/images/gclosmay2020.jpeg")
this_image.thumbnail((800, 1200))
this_image.save(output, format='PNG')
encoded_string = base64.b64encode(output.getvalue()).decode()

html = '<img src="data:image/png;base64,{}"/>'.format(encoded_string)
IPython.display.HTML(html)

Obtaining objective criteria. Grand Clos, St. Gingoplh May 2020

The challenge is to extract as much information from the litter objects based on the quantity found, the properties of the object and the environmental variables in proximity of the survey location.

Obtaining objective criteria from beach litter data is further complicated because of the \(\approx\) 61’000km of rivers and 1500 lakes in Switzerland [con].

The hydrologic conditions of rivers have an effect on the distance and direction that litter, once introduced into a river will travel. Large low density objects will most likely be transported to the next reservoir or area of reduced flow. High density objects will only be transported if the flow velocity and turbulence of the water are sufficient to keep the objects off the bottom.Once high density items enter a low velocity zone they tend to settle or sink. [SLBH19]

The results from a properly conducted beach litter survey provide many of the elements that describe the nature and the magnitude of the observed litter in an objective way [Han13]:

  1. quantities

  2. locations

  3. object type

  4. material type

  5. use case

  6. dates

  7. description of surrounding environment

The results from the latest national litter survey in Switzerland suggest that objects like cigarette ends and food wrappers are at least 21% of total objects recorded in 2020 and can be attributed to human behavior within 1500m of where it was found. [ham]

However, other objects have no definitive geographic source or no clear association with an activity in proximity to where they are found. The most common of these objects are \(\approx\) 40% of all litter items identified in 2020 -2021 [ham]. Thus there is a clear incentive to identify litter objects that are of local origin and those that may have arrived at the survey location by some-other means.

6.2. The test

The test compares the survey results of two groups of objects under two different land use profiles. The objects tested are based on the results of the Spearmans ranked correlation test on the survey results of the most common objects with respect to land use conditions (annex 1). The results of this test were used to create two groups of objects.

  1. Objects that have multiple positive associations to land use features and one association is to buildings

  2. Objects that have few or no positive associations to land use features and no associations to buildings

The locations were grouped according to the percent of land attributed to buildings, woods and agriculture.

6.2.1. The objects

The most common objects are those objects that were either among the top ten most abundant objects or objects that were found in at least 50% of the surveys. The most common objects were first separated into two groups according to the previously cited criteria:

  1. contributed (CG): objects that have multiple positive associations to land use features and one association is to buildings

    • Cigarette ends

    • Metal bottle tops

    • Snack wrappers

    • Glass bottles and pieces

  2. distributed (DG): objects that have few or no positive associations to land use features

    • Fragmented expanded polystyrene

    • Plastic production pellets

    • Fragmented plastics

    • Cotton swabs

    • Industrial sheeting

    • Construction plastics

Note: cotton swabs are included with DG because they are usually introduced directly into a body of water after passing through a water treatment facility.

  1. other: objects that do not meet the above criteria

Distribtuion group (DG) objects

# read images
img_a = mpimg.imread('resources/codegroups/images/fragplass_dense_450_600.jpg')
img_b = mpimg.imread('resources/codegroups/images/fragfoam_450_600.jpg')
img_c = mpimg.imread('resources/codegroups/images/infrastructure_450_600.jpg')
img_d = mpimg.imread('resources/codegroups/images/gpis_450_600.jpg')

# display images
fig, ax = plt.subplots(2,2, figsize=(12,8))

axone=ax[0,0]
ut.hide_spines_ticks_grids(axone)
axone.imshow(img_a);
axone.set_title("Fragmented plastics", **ck.title_k14)

axtwo=ax[0,1]
ut.hide_spines_ticks_grids(axtwo)
axtwo.imshow(img_b);
axtwo.set_title("Fragmented foams", **ck.title_k14)
axthree=ax[1,0]
ut.hide_spines_ticks_grids(axthree)
axthree.imshow(img_c);
axthree.set_title("Construction plastics", **ck.title_k14)
axfour=ax[1,1]
ut.hide_spines_ticks_grids(axfour)
axfour.set_title("Plastic production pellets", **ck.title_k14)
axfour.imshow(img_d);



plt.tight_layout()
plt.show()
../_images/assessing_transport_7_0.png

Two objects were selected from the CG and DG groups and tested under the same conditions

  1. From the CG group: Food and tobacco (FT)

    • Cigarette ends

    • Snack Wrappers

  2. From the DG group: Fragmented foams and plastics (FP)

    • Fragmented expanded polystyrene

    • Fragmented plastics

6.2.2. The locations

The survey locations were grouped into two classes:

  1. urban: locations that have a percent of land attributed to buildings GREATER than the median

  2. rural: locations that have a percent of land attributed to buildings LESS than the median AND percent of land attributed to woods or agriculture greater than the median

The rural class had 142 surveys for 47 locations versus 158 surveys from 37 locations in the urban class.

6.2.3. The hypothesis

If there is no significant evidence that a land use feature contributes to the accumulation of an object then the distribution of that object should be \(\approx\) under all land use conditions.

Null hypothesis: there is no statistically significant difference between survey results of DG objects at rural and urban locations.

alternate hypothesis: there is a statistically significant difference between survey results of DG objects at rural and urban locations.

6.3. The data

output = io.BytesIO()
bassin_map.thumbnail((800, 1200))
bassin_map.save(output, format='PNG')
encoded_string = base64.b64encode(output.getvalue()).decode()

html = '<img src="data:image/png;base64,{}"/>'.format(encoded_string)
IPython.display.HTML(html)

The survey locations 2020-2021: In total 385 litter-surveys were conducted touching 77 municipalities with a combined population of 1,735,000.

# collect all the data and transform
a_data = survey_data.copy()

# save adated for testing
a_dated = a_data[(a_data.date >= start_date)&(a_data.date <= end_date)].copy()

# format columns and slice by date
a_data['loc_date']=tuple(zip(a_data.location, a_data.date))
a_data['date']=pd.to_datetime(a_data['date'], format='%Y-%m-%d')
a_data = a_data[(a_data.date >= start_date)&(a_data.date <= end_date)]
a_data['material'] = a_data.code.map(lambda x: code_material_map.loc[x])
a_data['city'] = a_data.location.map(lambda x: city_map.loc[x])

# combine survey areas
# combine lugano and maggiore
a_data['river_bassin'] = a_data.river_bassin.where(a_data.river_bassin != 'tresa', 'ticino' )

# combine reuss and linth
a_data['river_bassin'] = a_data.river_bassin.where(a_data.river_bassin != 'reuss', 'linth' )

# convert meters of streets to kilometers
a_data['streets'] = a_data.streets.astype('int')
a_data['streets km'] = a_data.streets/1000

# make a reporting unit column, keep the pcs_m column
a_data[unit_label] = (a_data.pcs_m *reporting_unit).astype('int')

# Combine the different sizes of fragmented plastics and styrofoam
# the codes for the foams, these will be called Gfoam
some_foams = ['G81', 'G82', 'G83']

# the codes for the fragmented plastics, these will be called Gfrags
some_frag_plas = list(a_data[a_data.groupname == 'plastic pieces'].code.unique())

# this extracts and aggregates the members of the two code groups, returns a dataframe
the_plast_rows = sut.create_aggregate_groups(a_data, codes_to_agg=some_frag_plas,a_model_code="G79", a_new_code="Gfrags")
the_foam_rows = sut.create_aggregate_groups(a_data, codes_to_agg=some_foams, a_model_code="G82", a_new_code="Gfoam")

# the foam codes and fragmented plastic codes have been aggregated in to Gfrags and Gfoam
a_data = sut.replace_a_group_of_codes_with_one(a_data, new_code_values=[the_plast_rows, the_foam_rows], codes_to_replace=[*some_frag_plas, *some_foams])

# code and material totals
material_totals = a_data.groupby('material').quantity.sum()
code_totals = a_data.groupby(['code'], as_index=False).agg({unit_label:'mean', 'quantity':'sum'})

# cumulative statistics for each code
code_totals["% of total"] = ((code_totals.quantity/code_totals.quantity.sum())*100).round(2)
code_totals["fail"] = code_totals.code.map(lambda x: a_data[(a_data.code == x) & (a_data.quantity > 0)].loc_date.nunique())
code_totals["fail rate"] = ((code_totals.fail/a_data.loc_date.nunique())*100).astype('int')
code_totals.set_index('code', inplace=True)
code_totals['material'] = code_totals.index.map(lambda x: code_material_map[x])
code_totals['item'] = code_totals.index.map(lambda x: code_description_map[x])

# survey totals
dt_all = a_data.groupby(['loc_date','location','river_bassin', 'water_name_slug','city','date'], as_index=False).agg({unit_label:'sum', 'quantity':'sum'})

#!! feature data!!#
fd = a_data.copy()

# column headers for the survey area data
fd['survey area'] = fd.river_bassin.map(lambda x: ut.use_this_key(x, comp_labels))

# gather the municpalities and the population:
fd_pop_map = dfBeaches.loc[fd.location.unique()][['city','population']].copy()
fd_pop_map.drop_duplicates(inplace=True)
fd_pop_map.set_index('city', drop=True, inplace=True)

# survey_totals feature data
fd_dt=dt_all.copy()
fd_dt['month'] = fd_dt.date.dt.month

# map survey total quantity to loc_date
fd_dq = fd_dt[['loc_date', 'quantity']].set_index('loc_date')

# material totals:
fd_mat_t = code_totals[['material', 'quantity']].groupby('material', as_index=False).quantity.sum()
fd_mat_t["% of total"] = fd_mat_t.quantity/fd_mat_t.quantity.sum()

# material totals
fd_mat_totals = fd_mat_t.sort_values(by='quantity', ascending=False)

mat_cols_to_use = {'material':'Material','quantity':'Quantity', '% of total':'% of total'}
fd_mat_totals['% of total'] =(fd_mat_totals['% of total']*100).round(1)
fd_mat_totals['quantity']=fd_mat_totals.quantity.map(lambda x: '{:,}'.format(x))
fd_mat_totals['% of total']=fd_mat_totals['% of total'].map(lambda x:F"{x}%")
fd_mat_t = fd_mat_totals[mat_cols_to_use.keys()].values

# summary statistics:
fd_n_samps = len(fd_dt)
fd_n_obj = fd.quantity.sum()
fd_n_locs = fd.location.nunique()
fd_n_munis = len(fd_pop_map.index)
fd_effected_population = fd_pop_map.sum()

fd_locs = fd.location.unique()
fd_samps = fd.loc_date.unique()

# gather the dimensional data for the time frame from dfDims
fd_dims= dfDims[(dfDims.location.isin(fd_locs))&(dfDims.date >= start_date)&(dfDims.date <= end_date)].copy()

# map the survey area name to the dims data record
m_ap_to_survey_area = fd[['location', 'river_bassin']].drop_duplicates().to_dict(orient='records')
a_new_map = {x['location']:x['river_bassin'] for x in m_ap_to_survey_area}

# make a survey area column in the dims data
fd_dims['survey area'] = fd_dims.location.map(lambda x: ut.use_this_key(x, a_new_map))

# map length and area from dims to survey data
st_map = fd_dims[['loc_date', 'length', 'area']].to_dict(orient='records')
amap = {x['loc_date']:{'length':x['length'], 'area':x['area']}for x in st_map}
trans = {x:F"{x}"for x in fd.loc_date.unique()}

def this_map(x,amap,trans, var='length'):
    try:
        data = amap[trans[x]][var]
    except:
        data = 0
    return data  
    
fd['length'] = fd.loc_date.map(lambda x:  this_map(x,amap,trans, var='length'))
fd['area'] = fd.loc_date.map(lambda x:  this_map(x,amap,trans, var='area'))
fd['water'] = fd.location.map(lambda x: dfBeaches['water'][x])

# these surveys are missing area and length data. 
# use the average values from all the surveys at that location to fill in the missing valuesf
make_lengths = fd.loc[fd.location.isin(['baby-plage-geneva','quai-maria-belgia'])].groupby('location').agg({'length':'mean', 'area':'mean'})
fd.loc[fd.loc_date == ('baby-plage-geneva', '2021-04-14'), 'length'] = 84
fd.loc[fd.loc_date == ('baby-plage-geneva', '2021-04-14'), 'area'] = 355
fd.loc[fd.loc_date.isin([('quai-maria-belgia', '2021-02-28'), ('quai-maria-belgia', '2021-01-31')]), 'length'] = 34
fd.loc[fd.loc_date.isin([('quai-maria-belgia', '2021-02-28'), ('quai-maria-belgia', '2021-01-31')]), 'area'] = 145

# gather the land use data for each location
luse_this_study = dfBeaches.loc[fd_locs].copy()
sns.set_style('whitegrid')

# the locations in the area of walensee are missing luse dataf
# we are only comparing locations that have a complete profile
lu_th_st = luse_this_study[luse_this_study.luse_total > 650]

#! the data minus the locations with incomplete land use data
#! including the % ranking of the land use features
grtr_10 = fd[fd.location.isin(lu_th_st.index)].copy()
dt_nw = grtr_10.groupby(use_these_cols[:-2], as_index=False).agg({unit_label:'sum'})

# the cumulative distributions for the different survey areas
ecdfs = {x:{} for x in comps}
ecdfs.update({"All surveys":{}})

for i, n in enumerate(luse_exp):
    for element in comps:
        the_data = ECDF(dt_nw[dt_nw.river_bassin == element][n].values)
        ecdfs[element].update({n:the_data})
        x, y = the_data.x, the_data.y           
    a_all_surveys =  ECDF(dt_nw[n].values)
    ecdfs["All surveys"].update({n:a_all_surveys})

Out of the 385 surveys only those that were on lakes were considered for this analysis.

Survey results urban and rural locations March 2020 - May 2021. Left: Survey totals urban v/s rural, n=300. Right: distribution of survey results urban - rural with detail of code-group results.

# summary stats table
# labels for the .describe function
change_names = {'count':'# samples',
                'mean':F"average {unit_label}",
                'std':'standard deviation',
                '25%':'25%', 
                '50%':'50%',
                '75%':'75%',
                'max':F"max {unit_label}",
                'min':F"min {unit_label}",
                'total objects':'total objects',
                '# locations':'# locations',
                'survey year':'survey year'
               }

def anew_dict(x):
    new_dict = {}
    for param in x.index:
        new_dict.update({change_names[param]:x[param]})
    return new_dict  

#the most abundant
most_abundant = code_totals.sort_values(by="quantity", ascending=False)[:10].index

# found greater than 50% of the time
l_grtr_50 = code_totals[code_totals['fail rate'] >= 50].index

# the most common
most_common = list(set([*most_abundant, *l_grtr_50]))

# eleminate surveys less than 10m and greater than 100m
# restricts surveys to locations on lakes
grtr_10 = grtr_10[(grtr_10.water == 'l')].copy()

# add different date identifiers
grtr_10['month'] = grtr_10.date.dt.month
grtr_10["eom"] = grtr_10.date.map(lambda x: pd.Period(x,freq='M').end_time.date())

def add_pctile_rate(x,ecdfs,feature):
    data = ecdfs['All surveys'][feature](x)
    return np.round((data*100),1)  

# get the percentile ranking of the land use features for each location
grtr_10['b_group'] = grtr_10["% to buildings"].map(lambda x: add_pctile_rate(x,ecdfs,"% to buildings"))
grtr_10['a_group'] = grtr_10["% to agg"].map(lambda x: add_pctile_rate(x,ecdfs,"% to agg"))
grtr_10['w_group'] = grtr_10["% to woods"].map(lambda x: add_pctile_rate(x,ecdfs,"% to woods"))
grtr_10['r_group'] = grtr_10["% to recreation"].map(lambda x: add_pctile_rate(x,ecdfs,"% to recreation"))

# group all the land use % into called profile
# this is a unique identifier for each location
grtr_10['profile'] = list(zip(grtr_10["% to buildings"], grtr_10["% to agg"], grtr_10["% to woods"], grtr_10["% to recreation"]))

# identify rural and urban location
grtr_10['rural'] = ((grtr_10.w_group > 50) | (grtr_10.a_group > 50) ) & (grtr_10.b_group < 50)
grtr_10['rural'] = grtr_10['rural'].where(grtr_10['rural'] == True, 'urban')
grtr_10['rural'] = grtr_10['rural'].where(grtr_10['rural'] == 'urban', 'rural')

# make a list of the objects by their association to buildings, streets recreation and thier use
# food and drink
cont = ["G27", "G30", "G178", "G200"]

# objects with no or few associations and not related to tobacco or food
dist = list(set(most_common) - set(cont))

# lables for the two groups and a label to catch all the other objects
grtr_10['group'] = 'other'
grtr_10['group'] = grtr_10.group.where(~grtr_10.code.isin(dist), 'DG')
grtr_10['group'] = grtr_10.group.where(~grtr_10.code.isin(cont), 'CG')

# survey totals of all locations with its land use profile
initial = ['loc_date','date','rural','group','a_group', 'b_group', 'r_group', 'w_group', 'streets', 'intersects']
grtr_10dt=grtr_10.groupby(initial, as_index=False).agg({unit_label:"sum", "quantity":"sum"})
grtr_10qkey = grtr_10dt[['loc_date', 'quantity']].set_index('loc_date')

# assigning survey totals to the cont and dist survey totals
def wtf(x, grtr_10key):
    return grtr_10key.loc[[x]].values[0][0]

# survey totals of contributed and distributed objects, 
second = ['loc_date', 'group', 'rural', 'date','eom', 'river_bassin','location','a_group', 'b_group', 'r_group', 'w_group', 'streets', 'intersects','profile']
grtr_10dtc=grtr_10.groupby(second, as_index=False).agg({unit_label:"sum", "quantity":"sum"})
# adding the survey total of all objects to each record
grtr_10dtc['dt']= grtr_10dtc.loc_date.map(lambda x:fd_dq.loc[[x], 'quantity'][0])

# calculating the % total of contributed and distributed at each survey
grtr_10dtc['pt']= grtr_10dtc.quantity/grtr_10dtc.dt

# the_dists = grtr_10dtc[grtr_10dtc.group == 'DG']
# the_conts = grtr_10dtc[grtr_10dtc.group == 'CG']

less_than = grtr_10dtc[(grtr_10dtc['rural'] == 'rural')].location.unique()
grt_than = grtr_10dtc[(grtr_10dtc['rural'] == 'urban')].location.unique()
grt_dtr = grtr_10dtc.groupby(['loc_date', 'date','rural'], as_index=False)[unit_label].agg({unit_label:"sum"})
# months locator, can be confusing
# https://matplotlib.org/stable/api/dates_api.html
months = mdates.MonthLocator(interval=1)
months_fmt = mdates.DateFormatter('%b')
days = mdates.DayLocator(interval=7)
fig, axs = plt.subplots(1,2, figsize=(11,6), sharey=True)

group_palette = {'CG':'magenta', 'DG':'teal', 'other':'tan'}
rural_palette = {'rural':'black', 'urban':'salmon' }

ax = axs[0]
sns.scatterplot(data=grt_dtr, x='date', y=unit_label, hue='rural', s=80, palette=rural_palette, alpha=0.6, ax=ax)

ax.set_ylim(0,grt_dtr[unit_label].quantile(.98)+50 )

ax.set_xlabel("")
ax.set_ylabel(unit_label, **ck.xlab_k14)

ax.xaxis.set_minor_locator(days)
ax.xaxis.set_major_formatter(months_fmt)

axtwo = axs[1]

box_props = {
    'boxprops':{'facecolor':'none', 'edgecolor':'black'},
    'medianprops':{'color':'black'},
    'whiskerprops':{'color':'black'},
    'capprops':{'color':'black'}
}

sns.boxplot(data=grt_dtr, x='rural', y=unit_label, dodge=False, showfliers=False, ax=axtwo, **box_props)
sns.stripplot(data=grtr_10dtc,x='rural', y=unit_label, ax=axtwo, zorder=1, hue='group', palette=group_palette, jitter=.35, alpha=0.3, s=8)
axtwo.set_ylabel(unit_label, **ck.xlab_k14)

ax.tick_params(which='both', axis='both', labelsize=14)
axtwo.tick_params(which='both', axis='both', labelsize=14)
axtwo.set_xlabel(" ")

plt.tight_layout()
plt.show()
plt.close()
../_images/assessing_transport_18_0.png

Summary data all survey totals

change_names = {'count':'# samples', 
                'mean':F"average {unit_label}",
                'std':'standard deviation', 
                'min p/50m':'min', '25%':'25%',
                '50%':'50%', '75%':'75%',
                'max':F"max {unit_label}", 'min':F"min {unit_label}",
                'total objects':'total objects',
                '# locations':'# locations',
                'survey year':'survey year'
               }

# convenience function to change the index names in a series
def anew_dict(x):
    new_dict = {}
    for param in x.index:
        new_dict.update({change_names[param]:x[param]})
    return new_dict  

# select data
data = grt_dtr

# get the basic statistics from pd.describe
desc_2020 = data.groupby('rural')[unit_label].describe()
desc_2020.loc['all surveys', ['count', 'mean', 'std', 'min', '25%', '50%', '75%', 'max']] = grt_dtr.groupby(['loc_date', 'date'])[unit_label].sum().describe().to_numpy()
desc = desc_2020.astype('int')
desc = desc.applymap(lambda x: F"{x:,}")
desc.rename(columns={'count':'samples'}, inplace= True)
desc.reset_index(inplace=True)

# make tables
fig, axs = plt.subplots(figsize=(7,3.4))

# summary table
# names for the table columns
a_col = [top_name[0], 'total']

axone = axs
ut.hide_spines_ticks_grids(axone)

a_table = axone.table(cellText=desc.values,  colLabels=desc.columns, colWidths=[.19,*[.1]*8], loc='lower center', bbox=[0,0,1,1])
the_material_table_data = sut.make_a_summary_table(a_table,desc.values,desc.columns, s_et_bottom_row=False)


plt.tight_layout()
plt.subplots_adjust(wspace=0.2)
plt.show()
../_images/assessing_transport_20_0.png

6.4. Differences between urban and rural survey totals

Survey results in rural locations had a lower median and mean than urban locations and all locations combined. The maximum and the minimum values as well as the highest standard deviation were recorded at rural locations.

6.4.1. Confidence intervals (CIs) of the median survey results

The upper 95% of the median survey results from rural location is less than the 2.5% of the median survey result of both urban and combined survey results.

# this code was modified from this source:
# http://bebi103.caltech.edu.s3-website-us-east-1.amazonaws.com/2019a/content/recitations/bootstrapping.html
# if you want to get the confidence interval around another point estimate use np.percentile
# and add the percentile value as a parameter

def draw_bs_sample(data):
    """Draw a bootstrap sample from a 1D data set."""
    return np.random.choice(data, size=len(data))

def compute_jackknife_reps(data, statfunction=None, stat_param=False):
    '''Returns jackknife resampled replicates for the given data and statistical function'''
    # Set up empty array to store jackknife replicates
    jack_reps = np.empty(len(data))

    # For each observation in the dataset, compute the statistical function on the sample
    # with that observation removed
    for i in range(len(data)):
        jack_sample = np.delete(data, i)
        if not stat_param:
            jack_reps[i] = statfunction(jack_sample)
        else:
            jack_reps[i] = statfunction(jack_sample, stat_param)          
        
    return jack_reps


def compute_a(jack_reps):
    '''Returns the acceleration constant a'''

    mean = np.mean(jack_reps)
    try:
        a = sum([(x**-(i+1)- (mean**-(i+1)))**3 for i,x in enumerate(jack_reps)])
        b = sum([(x**-(i+1)-mean-(i+1))**2 for i,x in enumerate(jack_reps)])
        c = 6*(b**(3/2))
        data = a/c
    except:
        print(mean)
    return data


def bootstrap_replicates(data, n_reps=1000, statfunction=None, stat_param=False):
    '''Computes n_reps number of bootstrap replicates for given data and statistical function'''
    boot_reps = np.empty(n_reps)
    for i in range(n_reps):
        if not stat_param:
            boot_reps[i] = statfunction(draw_bs_sample(data))
        else:
            boot_reps[i] = statfunction(draw_bs_sample(data), stat_param)     
        
    return boot_reps


def compute_z0(data, boot_reps, statfunction=None, stat_param=False):
    '''Computes z0 for given data and statistical function'''
    if not stat_param:
        s = statfunction(data)
    else:
        s = statfunction(data, stat_param)
    return stats.norm.ppf(np.sum(boot_reps < s) / len(boot_reps))


def compute_bca_ci(data, alpha_level, n_reps=1000, statfunction=None, stat_param=False):
    '''Returns BCa confidence interval for given data at given alpha level'''
    # Compute bootstrap and jackknife replicates
    boot_reps = bootstrap_replicates(data, n_reps, statfunction=statfunction, stat_param=stat_param)
    jack_reps = compute_jackknife_reps(data, statfunction=statfunction, stat_param=stat_param)

    # Compute a and z0
    a = compute_a(jack_reps)
    z0 = compute_z0(data, boot_reps, statfunction=statfunction, stat_param=stat_param)

    # Compute confidence interval indices
    alphas = np.array([alpha_level/2., 1-alpha_level/2.])
    zs = z0 + stats.norm.ppf(alphas).reshape(alphas.shape+(1,)*z0.ndim)
    avals = stats.norm.cdf(z0 + zs/(1-a*zs))
    ints = np.round((len(boot_reps)-1)*avals)
    ints = np.nan_to_num(ints).astype('int')

    # Compute confidence interval
    boot_reps = np.sort(boot_reps)
    ci_low = boot_reps[ints[0]]
    ci_high = boot_reps[ints[1]]
    return (ci_low, ci_high)


the_bcas = {}

an_int = 50

# rural cis
r_median = grt_dtr[grt_dtr.rural == 'rural'][unit_label].median()
a_result = compute_bca_ci(grt_dtr[grt_dtr.rural == 'rural'][unit_label].to_numpy(), .05, n_reps=5000, statfunction=np.percentile, stat_param=an_int)
r_cis = {'rural':{"lower 2.5%":a_result[0], "median":r_median, "upper 97.5%": a_result[1] }}
the_bcas.update(r_cis)
# urban cis
u_median = grt_dtr[grt_dtr.rural == 'urban'][unit_label].median()
a_result = compute_bca_ci(grt_dtr[grt_dtr.rural == 'urban'][unit_label].to_numpy(), .05, n_reps=5000, statfunction=np.percentile, stat_param=an_int)
u_cis = {'urban':{"lower 2.5%":a_result[0], "median":u_median, "upper 97.5%": a_result[1] }}
the_bcas.update(u_cis)
# all surveys
u_median = grt_dtr[unit_label].median()
a_result = compute_bca_ci(grt_dtr[unit_label].to_numpy(), .05, n_reps=5000, statfunction=np.percentile, stat_param=an_int)
all_cis = {'all surveys':{"lower 2.5%":a_result[0], "median":u_median, "upper 97.5%": a_result[1] }}

# combine the results:
the_bcas.update(all_cis)

# make a df
the_cis = pd.DataFrame(the_bcas)
the_cis.reset_index(inplace=True)

fig, axs = plt.subplots()

data = the_cis.values
collabels = the_cis.columns
ut.hide_spines_ticks_grids(axs)

the_first_table_data = axs.table(data, colLabels=collabels, colWidths=[*[.2]*5], bbox=[0, 0, 1, 1])

a_summary_table_one = sut.make_a_summary_table(the_first_table_data,data,collabels, a_color, s_et_bottom_row=True)

a_summary_table_one.get_celld()[(0,0)].get_text().set_text(" ")

plt.show()
plt.close()
../_images/assessing_transport_22_0.png

The 95% confidence interval of survey totals under urban and rural land use conditions

The initial observation that the median survey result in urban locations is greater than the other locations is supported by the result of 95% CIs. The lower end of the urban ci is greater than the upper end of both rural survey locations and all survey locations.

6.5. Survey results of CG and DG with respect to land use

Recall that the most common objects were grouped according to the value of the Spearmans ranked correlation test of p/100m with respect to the land use profile, (annex 1). Creating two groups, one group of objects that has a positive association with % of land attributed to buildings and one that does not. Cotton swabs are included with the objects that have few or no associations with land use, because they are normally expelled by water treatment plants directly into the nearest body of water.

The objects with no or few positive associations to land use characteristics (DG):

  1. Cotton swabs

  2. Expanded polystyrene foams > 5mm

  3. Extruded foams

  4. Industrial sheeting

  5. Expanded foams < 5mm

  6. Industrial pellets

  7. Plastic construction waste

  8. Fragmented plastics

The objects with multiple positive associations to land use characteristics (CG):

  1. Cigarette ends

  2. Snack wrappers

  3. Glass bottles and pieces

  4. Metal bottle tops

6.5.1. Assessment of composition

The ratio of DG to CG in the rural group was 1.7, in the urban group it was 0.81. On a per survey basis, DG were a greater percent of the total in all surveys from rural locations. In urban locations the ratio of DG to CG is the inverse of the rural locations and for approximately 20% of the surveys in urban locations the ratio of DG to CG is very close to 1.

Sample results from rural locations had a greater portion of trash attributed to fragmented plastics, construction plastics and foams.

# combine the two object groups into one data frame
DG = "DG"
CG = "CG"

dists = grtr_10dtc[(grtr_10dtc.group == DG)][['loc_date', 'location', unit_label]].set_index('loc_date')
conts = grtr_10dtc[(grtr_10dtc.group == CG)][['loc_date', 'location',  unit_label]].set_index('loc_date')
conts.rename(columns={unit_label:CG}, inplace=True)
dists.rename(columns={unit_label:DG}, inplace=True)
c_v_d = pd.concat([dists, conts], axis=0)
c_v_d['dt'] = c_v_d[DG]/c_v_d[CG]

# the ratio of dist to cont under the two land use conditions
ratio_of_d_c_agg = c_v_d[c_v_d.location.isin(less_than)][DG].sum()/c_v_d[c_v_d.location.isin(less_than)][CG].sum()
ratio_of_d_c_urb= c_v_d[c_v_d.location.isin(grt_than)][DG].sum()/c_v_d[c_v_d.location.isin(grt_than)][CG].sum()

# chart that
fig, ax = plt.subplots(figsize=(6,5))

# get the ecdf of percent of total for each object group under each condition
# p of t urban
co_agecdf = ECDF(grtr_10dtc[(grtr_10dtc.location.isin(grt_than))&(grtr_10dtc.group.isin([CG]))]["pt"])
di_agecdf = ECDF(grtr_10dtc[(grtr_10dtc.location.isin(grt_than))&(grtr_10dtc.group.isin([DG]))]["pt"])

# p of t rural
cont_ecdf = ECDF(grtr_10dtc[(grtr_10dtc.location.isin(less_than))&(grtr_10dtc.group.isin([CG]))]["pt"])
dist_ecdf = ECDF(grtr_10dtc[(grtr_10dtc.location.isin(less_than))&(grtr_10dtc.group.isin([DG]))]["pt"])

sns.lineplot(x=cont_ecdf.x, y=cont_ecdf.y, color='salmon', label="rural: contributed", ax=ax)
sns.lineplot(x=co_agecdf.x, y=co_agecdf.y, color='magenta', ax=ax, label="urban: contributed")
sns.lineplot(x=dist_ecdf.x, y=dist_ecdf.y, color='teal', label="rural: distributed", ax=ax)
sns.lineplot(x=di_agecdf.x, y=di_agecdf.y, color='black', label="urban: distributed", ax=ax)

ax.set_xlabel("% of survey total", **ck.xlab_k14)
ax.set_ylabel("% of surveys", **ck.xlab_k14)
plt.legend(loc='lower right', title="% of total")

plt.show()
../_images/assessing_transport_26_0.png

Difference in composition of rural and urban litter surveys

Samples with a higher percentage of distributed objects and lower percentage of contributed objects were more likely in rural environments

Under different land use conditions 9/10 of the most common objects are the same in rural and urban settings. There are two exceptions:

  • plastic construction waste made the top ten in rural settings

  • industrial pellets made the top ten in urban settings

The recorded quantities of cigarette ends, glass bottles and snack wrappers were greater under urban conditions.

The most common objects under different land use profiles. Left: rural, right: urban

rur_10 = grtr_10[grtr_10.location.isin(less_than)&(grtr_10.code.isin(most_common))].groupby('code', as_index=False).quantity.sum().sort_values(by='quantity', ascending=False)
urb_10 = grtr_10[grtr_10.location.isin(grt_than)&(grtr_10.code.isin(most_common))].groupby('code', as_index=False).quantity.sum().sort_values(by='quantity', ascending=False)

rur_tot = grtr_10[grtr_10.location.isin(less_than)].quantity.sum()
urb_tot = grtr_10[grtr_10.location.isin(grt_than)].quantity.sum()

rur_10['item'] = rur_10.code.map(lambda x: code_description_map.loc[x])
urb_10['item'] = urb_10.code.map(lambda x: code_description_map.loc[x])

rur_10["% of total"] = ((rur_10.quantity/rur_tot)*100).round(1)
urb_10["% of total"] = ((urb_10.quantity/urb_tot)*100).round(1)

# make tables
fig, axs = plt.subplots(1, 2, figsize=(10,len(most_common)*.8))

# summary table
# names for the table columns
a_col = [top_name[0], 'total']

axone = axs[0]
axtwo = axs[1]

ut.hide_spines_ticks_grids(axone)
ut.hide_spines_ticks_grids(axtwo)

data_one = rur_10[['item', 'quantity', "% of total"]].copy()
data_two = urb_10[['item', 'quantity', "% of total"]].copy()

for a_df in [data_one, data_two]:
    a_df['quantity'] = a_df.quantity.map(lambda x: F"{x:,}")

a_table = axone.table(cellText=data_one.values,  colLabels=data_one.columns, colWidths=[.6,*[.2]*2], loc='lower center', bbox=[0,0,1,1])
the_material_table_data = sut.make_a_summary_table(a_table,data_one.values,data_one.columns, s_et_bottom_row=True)

a_table = axtwo.table(cellText=data_two.values,  colLabels=data_one.columns, colWidths=[.6,*[.2]*2], loc='lower center', bbox=[0,0,1,1])
the_material_table_data = sut.make_a_summary_table(a_table,data_one.values,data_one.columns, s_et_bottom_row=True)

axone.set_xlabel("rural", **ck.xlab_k14)
axtwo.set_xlabel("urban", **ck.xlab_k14)
plt.tight_layout()
plt.subplots_adjust(wspace=0.2)
plt.show()
../_images/assessing_transport_30_0.png

6.5.2. Distribution of survey totals

The survey results of the DG are very similar under both land use classes, there is more variance as the reported value increases but not so much that the distributions diverge. Given the standard deviation of the samples and the high variance of beach-litter-survey data in general this is expected. [HG19]

The two sample Kolmogorov-Smirnov(KS) test(ks=0.073, p=0.808) of the two sets of survey results suggest that the survey results of DG may not be significantly different between the two land use classes. The results from the Mann-Whitney U (MWU) (U=11445.0, p=0.762) *suggest that it is possible that the two distributions are the same.**[sca] [scb]

Empirical Cumulative Distribution (ECDF) of the survey results of DG and CG objects under the different land use classes

left: DG , right: CG

dist_results_agg = grtr_10dtc[(grtr_10dtc.location.isin(less_than))&(grtr_10dtc.group == DG)].groupby(['loc_date', 'group'])[unit_label].sum().values
dist_results_urb = grtr_10dtc[(grtr_10dtc.location.isin(grt_than))&(grtr_10dtc.group == DG)].groupby(['loc_date', 'group'])[unit_label].sum().values
a_d_ecdf = ECDF(dist_results_agg )
b_d_ecdf = ECDF(dist_results_urb )

cont_results_agg = grtr_10dtc[(grtr_10dtc.location.isin(less_than))&(grtr_10dtc.group == CG)].groupby('loc_date')[unit_label].sum().values
cont_results_urb = grtr_10dtc[(grtr_10dtc.location.isin(grt_than))&(grtr_10dtc.group == CG)].groupby('loc_date')[unit_label].sum().values
a_d_ecdf_cont = ECDF(cont_results_agg)
b_d_ecdf_cont = ECDF(cont_results_urb)

fig, ax = plt.subplots(1,2, figsize=(8,5))

axone = ax[0]
sns.lineplot(x=a_d_ecdf.x, y=a_d_ecdf.y, color='salmon', label="rural", ax=axone)
sns.lineplot(x=b_d_ecdf.x, y=b_d_ecdf.y, color='black', label="urban", ax=axone)
axone.set_xlabel(unit_label, **ck.xlab_k14)
axone.set_ylabel('% of surveys', **ck.xlab_k14)
axone.legend(fontsize=12, title=DG,title_fontsize=14)


axtwo = ax[1]
sns.lineplot(x=a_d_ecdf_cont.x, y=a_d_ecdf_cont.y, color='salmon', label="rural", ax=axtwo)
sns.lineplot(x=b_d_ecdf_cont.x, y=b_d_ecdf_cont.y, color='black', label="urban", ax=axtwo)

axtwo.set_xlabel(unit_label, **ck.xlab_k14)
axtwo.set_ylabel(' ')
axtwo.legend(fontsize=12, title=CG,title_fontsize=14)

plt.show()
../_images/assessing_transport_33_0.png

On the other hand the CG survey results diverge almost immediately according to the land use class. This test results supports the decision to reject the null hypothesis of the Spearmans ranked correlation test for this group of codes and land use profile. The rural locations have less buildings and more agriculture or woods. The two conditions together should reduce the amount of tobacco and food wrappers for that class.

The KS and MWU tests both agree with the visual results that rural and urban CG survey results most likely come from different distributions and they are not equal (ks=0.284, pvalue<.0001), (U=7559.0, pvalue<.0001).

6.5.2.1. Difference of means

The average survey result of DG objects in rural locations was 139.86p/100m in urban locations and 117.75p/100m. A permutation test on the difference of means was conducted on the condition rural - urban.

Difference of means DG objects. \(\mu_{rural}\) - \(\mu_{urban}\), method=shuffle, permutations=5000.

# print(stats.ks_2samp(dist_results_agg, dist_results_urb, alternative='two-sided', mode='auto'))
# print(stats.mannwhitneyu(dist_results_agg,dist_results_urb, alternative='two-sided'))

# #the results for contributed objects
# print(stats.ks_2samp(cont_results_agg, cont_results_urb, alternative='two-sided', mode='auto'))
# print(stats.mannwhitneyu(cont_results_agg, cont_results_urb, alternative='two-sided'))

# pemutation test: of difference of means HD objects
agg_dobj = grtr_10[(grtr_10.location.isin(less_than))&(grtr_10.group == DG)].groupby(['loc_date'], as_index=False)[unit_label].sum()
buld_obj = grtr_10[(grtr_10.location.isin(grt_than))&(grtr_10.group == DG)].groupby(['loc_date'], as_index=False)[unit_label].sum()
# label the urban and rural results
agg_dobj['class'] = 'rural'
buld_obj['class'] = 'urban'

# merge into one 
objs_merged = agg_dobj.append(buld_obj)

# store the mean per class
the_mean = objs_merged.groupby('class')[unit_label].mean()

# store the difference
mean_diff = the_mean.loc['rural'] - the_mean.loc['urban']
new_means=[]
# permutation resampling:
for element in np.arange(5000):
    objs_merged['new_class'] = objs_merged['class'].sample(frac=1).values
    b=objs_merged.groupby('new_class').mean()
    new_means.append((b.loc['rural'] - b.loc['urban']).values[0])
emp_p = np.count_nonzero(new_means <= (the_mean.loc['rural'] - the_mean.loc['urban'])) / 5000

# chart the results
fig, ax = plt.subplots()

sns.histplot(new_means, ax=ax)
ax.set_title(F"$\u0394\mu$ = {np.round(mean_diff, 2)}, perm=5000, p={emp_p} ", **ck.title_k14)
ax.set_ylabel('permutations', **ck.xlab_k14)
ax.set_xlabel("$\mu$ rural - $\mu$ urban", **ck.xlab_k14)
plt.show()
../_images/assessing_transport_36_0.png

Refuse to reject the null hypothesis that these two distributions may be the same

The test of the difference of medians supports the observations that DG survey results occur at similar rates indifferent of the land use profile.

6.5.3. Seasonal variations

Seasonal variations of beach litter survey results has been documented under many conditions and many environments. In 2018 the SLR [Bla18] reported the maximum value in July and the minimum in November. The year 2020-2021 presents the same results.

# the survey results to test
corr_data = grtr_10[(grtr_10.code.isin(most_common))].copy()
results_sprmns = {}
for i,code in enumerate(most_common):
    data = corr_data[corr_data.code == code]
    for j, n in enumerate(luse_exp):
        corr, a_p = stats.spearmanr(data[n], data[unit_label])
        results_sprmns.update({code:{"rho":corr, 'p':a_p}})

# helper dict for converting ints to months
months={
    0:'Jan',
    1:'Feb',
    2:'Mar',
    3:'Apr',
    4:'May',
    5:'Jun',
    6:'Jul',
    7:'Aug',
    8:'Sep',
    9:'Oct',
    10:'Nov',
    11:'Dec'
}

m_dt = grtr_10.groupby(['loc_date', 'date','group'], as_index=False).agg({'quantity':'sum', unit_label:'sum'})

# sample totals all objects
m_dt_t = grtr_10.groupby(['loc_date','date','month', 'eom'], as_index=False).agg({unit_label:'sum'})
m_dt_t.set_index('date', inplace=True)

# data montlhy median survey results contributed, distributed and survey total
fxi=m_dt.set_index('date', drop=True)
data1 = fxi[fxi.group == CG][unit_label].resample("M").mean()
data2 = fxi[fxi.group == DG][unit_label].resample("M").mean()

# data3 = m_dt_t[unit_label].resample("M").mean()

# helper tool for months in integer order
def new_month(x):
    if x <= 11:
        this_month = x
    else:
        this_month=x-12    
    return this_month


# the monthly average discharge rate of the three rivers where the majority of the samples are
aare_schonau = [61.9, 53, 61.5, 105, 161, 155, 295, 244, 150, 106, 93, 55.2, 74.6, 100, 73.6, 92.1]
rhone_scex =   [152, 144, 146, 155, 253, 277, 317, 291, 193, 158, 137, 129, 150, 146, 121, 107]
linth_weesen = [25.3, 50.7, 40.3, 44.3, 64.5, 63.2, 66.2, 61.5, 55.9, 52.5, 35.2, 30.5, 26.1, 42.0, 36.9]

fig, ax = plt.subplots()
    
this_x = [i for i,x in  enumerate(data1.index)]
this_month = [x.month for i,x in enumerate(data1.index)]

twin_ax = ax.twinx()
twin_ax.grid(None)

ax.bar(this_x, data1.to_numpy(), label='contributed', bottom=data2.to_numpy(), linewidth=1, color="salmon", alpha=0.6)
ax.bar([i for i,x in  enumerate(data2.index)], data2.to_numpy(), label='distributed', linewidth=1,color="darkslategray", alpha=0.6)

sns.scatterplot(x=this_x,y=[*aare_schonau[2:], np.mean(aare_schonau)], color='turquoise',  edgecolor='magenta', linewidth=1, s=60, label='Aare m³/s', ax=twin_ax)
sns.scatterplot(x=this_x,y=[*rhone_scex[2:], np.mean(rhone_scex)], color='royalblue',  edgecolor='magenta', linewidth=1, s=60, label='Rhône m³/s', ax=twin_ax)
sns.scatterplot(x=this_x,y=[*linth_weesen[2:], np.mean(linth_weesen), np.mean(linth_weesen)], color='orange', edgecolor='magenta', linewidth=1, s=60, label='Linth m³/s', ax=twin_ax)
handles, labels = ax.get_legend_handles_labels()
handles2, labels2 = twin_ax.get_legend_handles_labels()
ax.xaxis.set_major_locator(ticker.FixedLocator([i for i in np.arange(len(this_x))]))

ax.set_ylabel(unit_label, **ck.xlab_k14)
twin_ax.set_ylabel("m³/sec", **ck.xlab_k14)

axisticks = ax.get_xticks()
labelsx = [months[new_month(x-1)] for x in  this_month]

plt.xticks(ticks=axisticks, labels=labelsx)
plt.legend([*handles, *handles2], [*labels, *labels2], bbox_to_anchor=(0,-.1), loc='upper left', fontsize=14)

plt.show()
../_images/assessing_transport_40_0.png

monthly survey results and river discharge rates m³/second

April and May 2021 are rolling averages, data not available

source : https://www.hydrodaten.admin.ch/en/stations-and-data.html?entrance_medium=menu

6.6. The survey results of FP and FT with respect to land use

Results of KS test and Mann Whitney U

The survey results for FP objects is very similar up to \(\approx\) the 85th percentile where the rural survey results are noticeably larger. Suggesting that extreme values for FP were more likely in rural locations. According to the KS test (ks=0.092, pvalue=0.50) and MWU test (U=11592.0, pvalue=0.62) the distribution of FP objects under the two land use classes is not significantly different and may be equal.

The survey results for FT objects maintain the same features as the parent distribution. The results of the KS test (ks=0.29, pvalue<.001) and MWU test (U=7647.5, p<.001) agree with the results of the parent group, that there is a statistically relevant difference between the survey results under different land use classes.

Left rural - urban: ECDF of survey results fragmented plastics and foams (FP)

Right rural - urban: ECDF of survey results cigarette ends and candy wrappers (FT)

agg_dobj = grtr_10[(grtr_10.location.isin(less_than))&(grtr_10.code.isin(['Gfrags', 'Gfoam']))].groupby(['loc_date'])[unit_label].sum().values
buld_obj = grtr_10[(grtr_10.location.isin(grt_than))&(grtr_10.code.isin(['Gfrags', 'Gfoam']))].groupby(['loc_date'])[unit_label].sum().values

a_d_ecdf = ECDF(agg_dobj)
b_d_ecdf = ECDF(buld_obj)

# print(len(a_d_ecdf.x))
# print(len(b_d_ecdf.x))
# print(a_d_ecdf(17), b_d_ecdf(15))
# print(np.quantile(a_d_ecdf.x,.65))
# print(np.quantile(b_d_ecdf.x, .65))

agg_cont = grtr_10[(grtr_10.location.isin(less_than))&(grtr_10.code.isin(['G27', 'G30']))].groupby(['loc_date'])[unit_label].sum().values
b_cont = grtr_10[(grtr_10.location.isin(grt_than))&(grtr_10.code.isin(['G27', 'G30']))].groupby(['loc_date'])[unit_label].sum().values

a_c_ecdf = ECDF(agg_cont)
b_c_ecdf = ECDF(b_cont)

fig, ax = plt.subplots(1,2, figsize=(8,5))

axone = ax[0]
sns.lineplot(x=a_d_ecdf.x, y=a_d_ecdf.y, color='salmon', label="rural", ax=axone)
sns.lineplot(x=b_d_ecdf.x, y=b_d_ecdf.y, color='black', label="urban", ax=axone)
axone.set_xlabel(unit_label, **ck.xlab_k14)
axone.set_ylabel('% of surveys', **ck.xlab_k14)

axone.legend(fontsize=12, title='FP',title_fontsize=14)



axtwo = ax[1]
sns.lineplot(x=a_c_ecdf.x, y=a_c_ecdf.y, color='salmon', label="rural", ax=axtwo)
sns.lineplot(x=b_c_ecdf.x, y=b_c_ecdf.y, color='black', label="urban", ax=axtwo)

axtwo.set_xlabel(unit_label, **ck.xlab_k14)
axtwo.set_ylabel(' ')

# ax.legend(fontsize=12, title='TITLE (fs=30)',title_fontsize=30)

axtwo.legend(fontsize=12, title='FT',title_fontsize=14)
# axtwo.get_legend().set_title("Fragmented plastics")

plt.tight_layout()

plt.show()
../_images/assessing_transport_44_0.png

6.6.1. FP and FT difference of means.

The average survey result of FP objects in rural locations was 22.93p/50m in urban locations it was 12p/50m. A permutation test on the difference of means was conducted on the condition rural - urban.

Difference of means fragmented foams and plastics under the two different land use classes.

\(\mu_{rural}\) - \(\mu_{urban}\), method=shuffle, permutations=5000

# pemutation test: of difference of means FP objects
agg_dobj = grtr_10[(grtr_10.location.isin(less_than))&(grtr_10.code.isin(['Gfrags', 'G89']))].groupby(['loc_date'], as_index=False)[unit_label].sum()
buld_obj = grtr_10[(grtr_10.location.isin(grt_than))&(grtr_10.code.isin(['Gfrags', 'G89']))].groupby(['loc_date'], as_index=False)[unit_label].sum()
# label the urban and rural results
agg_dobj['class'] = 'rural'
buld_obj['class'] = 'urban'

# merge into one 
objs_merged = agg_dobj.append(buld_obj)

# store the mean per class
the_mean = objs_merged.groupby('class')[unit_label].mean()

# store the difference
mean_diff = the_mean.loc['rural'] - the_mean.loc['urban']
new_means=[]

# permutation resampling:
for element in np.arange(5000):
    objs_merged['new_class'] = objs_merged['class'].sample(frac=1).values
    b=objs_merged.groupby('new_class').mean()
    new_means.append((b.loc['rural'] - b.loc['urban']).values[0])
emp_p = np.count_nonzero(new_means <= (the_mean.loc['rural'] - the_mean.loc['urban'])) / 5000

# chart the results
fig, ax = plt.subplots()

sns.histplot(new_means, ax=ax)
ax.set_title(F"$\u0394\mu$ = {np.round(mean_diff, 2)}, perm=5000, p={emp_p} ", **ck.title_k14)
ax.set_ylabel('permutations', **ck.xlab_k14)
ax.set_xlabel("$\mu$ rural - $\mu$ urban", **ck.xlab_k14)
plt.show()
../_images/assessing_transport_48_0.png

Refuse to reject the null hypotheses: there is no statistically significant difference between the two distributions

Difference of means cigarette ends and snack wrappers under the two different land use classes.

\(\mu_{rural}\) - \(\mu_{urban}\), urban method=shuffle, permutations=5000

# pemutation test: of difference of means food objects
agg_cont = grtr_10[(grtr_10.location.isin(less_than))&(grtr_10.code.isin(['G27', 'G30']))].groupby(['loc_date'], as_index=False)[unit_label].sum()
b_cont = grtr_10[(grtr_10.location.isin(grt_than))&(grtr_10.code.isin(['G27', 'G30']))].groupby(['loc_date'], as_index=False)[unit_label].sum()
# label the urban and rural results
agg_cont['class'] = 'rural'
b_cont['class'] = 'urban'

# merge into one 
objs_merged = agg_cont.append(b_cont)

# store the mean per class
the_mean = objs_merged.groupby('class')[unit_label].mean()

# store the difference
mean_diff = the_mean.loc['rural'] - the_mean.loc['urban']

# permutation resampling:
for element in np.arange(5000):
    objs_merged['new_class'] = objs_merged['class'].sample(frac=1).values
    b=objs_merged.groupby('new_class').mean()
    new_means.append((b.loc['rural'] - b.loc['urban']).values[0])
emp_p = np.count_nonzero(new_means <= (the_mean.loc['rural'] - the_mean.loc['urban'])) / 5000

# chart the results
fig, ax = plt.subplots()

sns.histplot(new_means, ax=ax)
ax.set_title(F"$\u0394\mu$ = {np.round(mean_diff, 2)}, perm=5000, p={emp_p} ", **ck.title_k14)
ax.set_ylabel('permutations', **ck.xlab_k14)
ax.set_xlabel("$\mu$ rural - $\mu$ urban", **ck.xlab_k14)
plt.show()
../_images/assessing_transport_51_0.png

Reject the null hypothesis: the two distributions are most likely not the same

6.7. Discussion

The results of the test show that there is no statistical difference of the distribution of survey results for DG objects under the two land use conditions. Therefore the null hypothesis can not be rejected and we assume that the distribution of these objects \(\approx\) the same under all land use classes.

In contrast, the CG objects that have diverging rates depending on the land use profile and a corresponding positive association to the % of land attributed to buildings. The null hypothesis is therefore rejected for CG objects and we conclude that there is a statistically significant difference under the two land use classes.

This information can be used to make logical groups of responsibility that are centered around objects and their propensity to accumulate under certain conditions.

Establishing objective criteria. Identifying and counting the objects after a litter survey

output = io.BytesIO()

this_image = PILImage.open("resources/codegroups/images/takingnotes.jpg")
this_image.thumbnail((800, 1200))
this_image.save(output, format='PNG')
encoded_string = base64.b64encode(output.getvalue()).decode()

html = '<img src="data:image/png;base64,{}"/>'.format(encoded_string)
IPython.display.HTML(html)

The data is then entered into the application by smartphone or pc

6.7.1. Collecting objective criteria

The requirement of objective criteria is a way of ensuring that the principle of causality is applied appropriately. The survey results from IQAASL and SLR both provide the basic elements of objective criteria:

  1. The gps coordinates where the objects were found

  2. The date the objects were found

  3. The quantity of objects found

  4. A description of the object and its uses in the economy

  5. The material type of the object

  6. The dimensions of the area surveyed

  7. The GMDE number of the municipality

The survey results can then be put into context with municipal data about the economic and geographic situation surrounding the survey location, as was done in this example.

There are limits to the information that can be extracted from beach litter surveys given the current protocol. These limits are related to the costs to complete one survey and the resulting number of samples per sampling period. It is best to establish need and consider all the factors before changing or adding to the current protocol:

  1. What question does it answer?

  2. Does it introduce bias?

  3. Is there a secondary use for the data?

  4. Do these measures apply to all survey areas?

  5. How easy are the new measures to repeat?

  6. What is the financial burden?

Care must be taken to maintain objectivity throughout the process. The best way to estimate the current state is having as many objective samples as possible within the area of study. Currently the protocol allows for a proven, repeatable, broad based assessment with minimal financial burden and almost no fixed costs. Anything that changes these basic conditions should be avoided.

6.7.2. Creating partnerships

Assigning secondary responsibility should be perceived and constructed as a partnership. A partnership between consumers and producers to reduce the incidence of these objects in the environment.

6.7.2.1. Set clear goals

The methods used to establish objective criteria should also be used in all benchmark and baseline calculations. Having benchmarks and goals are essential in establishing return on investment:

To demonstrate a return on an investment their needs to be a baseline measurement, clear benchmarks and goals that have both a temporal and spatial definition.

The system of measurement needs to be agreed upon and based on objective, recognized numeric measurements that do not require any special equipment

The second point is essential. Stakeholders can train and learn the data collection methods that pertain to the baseline values with very little investment. Thus it allows for a broad based assessment of the attainment of defined benchmarks and goals.

6.7.2.2. Finding partners

The results from the test indicate that CG objects are more prevalent in urban locations. Urban was defined as the land use within 1500m of the survey area. From this it is safe to assume that the cause(s) of CG group litter are also more prevalent in urban areas and that the secondary cause of the litter is within 1500m of the survey location.

Stakeholders looking to reduce the incidence of CG objects within a specific zone may have a better chance of finding motivated partners within 1500m of the location of concern.

The DG group has the particularity that it is distributed in \(\approx\) rates indifferent of the land use and it makes up a larger proportion of the objects found than CG. This implies that the solution is at a larger scale than the municipal boundaries.

Fragmented plastics is the DG only object on the list that cannot be attributed to at least one industry that is present in all the survey areas covered by this analysis.

  • Expanded polystyrene is used as an exterior insulation envelope in the construction industry and is also used in packaging to protect fragile objects during transport.

  • Plastic production pellets are used to make plastic objects in the injection molding process.

  • Cotton swabs are often diverted to rivers and lakes after passing through a water treatment plant.

  • Industrial sheeting is used in horticulture, transport and the construction industry.

  • Construction plastics

Finding partners for these objects may involve an initial phase of informative targeted communication that calls attention to the study results, and the previous results from the SLR. The method of publication of this article makes it very easy to share objective criteria. It is essential to maximize pre-existing relationships at the regional level to develop meaningful communication strategies and clear goals and benchmarks.

6.7.2.3. Individual items

In cases such as the plastic production pellet (GPI), the use case of the item is definite and users and producers are relatively rare with respect to other litter items. Therefore the origin can be determined by searching for consumers of that product within 1500m of the survey location and then progressing upstream. This can be done by locating the survey locations on a map and then overlaying the locations of likely consumers or producers of the object in question.

A causal relationship could be inferred from the image below but not assumed. GPI are small and difficult to clean up once they have been spilled making the exact source impossible to determine. However it is reasonable to assume that the handlers and consumers of GPI will have the best ideas on how to prevent them from escaping into the environment.

output = io.BytesIO()
bassin_map = PILImage.open("resources/images/causality.jpeg")
bassin_map.thumbnail((800, 1200))
bassin_map.save(output, format='PNG')
encoded_string = base64.b64encode(output.getvalue()).decode()

html = '<img src="data:image/png;base64,{}"/>'.format(encoded_string)
IPython.display.HTML(html)

Identifying secondary sources of specific litter items. Consumers or handlers of plastic production pellets and probable fluvial route to survey location. Venoge and Thiele rivers.

Enterprise names withheld

6.7.2.4. Sharing the responsibility

The principle of Extended Producer Responsibility (EPR) may provide the incentive to producers and consumers to account for the real costs of end-of-life management for the most common litter items identified in Switzerland. [Pou20]

A recent study in the Marine Policy journal identified several limitations of using preexisting beach litter survey data to assess the impact of EPR policy on observed litter quantities. [HLCM21]

  1. Limited data

  2. Heterogeneous methods

  3. Data not collected for the purposes of evaluating ERP

To correct these limitations the authors provide the following recommendations:

  1. Create a data framework specifically for monitoring ERP targets

  2. Identify sources

  3. Use litter counts to establish baselines

  4. Frequent monitoring

Litter counts mitigate the effects of light-weight packaging when collection results are based on weights. [HLCM21]

The IQAASl project addresses three out of the four recommendations and it introduced a method that allows stakeholders to add specific items to the survey protocol. Thus monitoring progress towards ERP targets on a national scale is as simple as defining the characteristics of the objects and creating an entry in the object database.

The issues of heterogeneous methods was solved by ensuring that the data collected in IQAASL was compatible with the data collected by the SLR. Specifically the method to aggregate data from one period to the next was established prior to the IQAASL sampling period. The methods were written in Python and are considered a product of the project.

The current database of beach-litter surveys in Switzerland includes over 1,500 samples collected using the same protocol with two distinct sampling periods of twelve months. Switzerland has all the elements in place to accurately estimate minimum probable values for the most common objects and evaluate stochastic changes on a monthly interval.

There are many improvements to be made concerning the national strategy:

  1. Defining a standardized reporting method for municipal, cantonal and federal stakeholders

  2. Define monitoring or assessment goals

  3. Formalize the data repository and the method for implementation at different administrative levels

  4. Develop a network of associations that share the responsibility and resources for surveying the territory

  5. Develop and implement a formal training program for surveyors

  6. Determine, in collaboration with academic partners, ideal sampling scenarios and research needs

  7. Develop a financing method to ensure that enough samples are collected each year in each survey area so that accurate assessments can be made and research requirements are met.



This project was made possible by the Swiss federal office for the environment.

Love what you do. ❤️

roger@hammerdirt.ch pushed the run button on 2021-09-14.
This document originates from https://github.com/hammerdirt-analyst/IQAASL-End-0f-Sampling-2021 all copyrights apply.

6.8. Annex

  1. surveyors

  2. municipalities, lakes, rivers

The average length and survey area for each location was used to replace the missing values for each record.

6.8.1. IQAASL surveyors

Hammerdirt staff:

  1. Shannon Erismann, field operations manager

  2. Helen Kurukulasuriya, surveyor

  3. Débora Carmo, surveyor

  4. Bettina Siegenthaler, surveyor

  5. Roger Erismann, surveyor

Participating organizations:

  1. Precious plastic leman

  2. Association pour la sauvetage du Léman

  3. Geneva international School

  4. Solid waste management: École polytechnique fédéral Lausanne

6.8.2. Municipalities, lakes and rivers with surveys

lakes = dfBeaches.loc[(dfBeaches.index.isin(fd_locs))&(dfBeaches.water == 'l')]['water_name'].unique()
rivers = dfBeaches.loc[(dfBeaches.index.isin(fd_locs))&(dfBeaches.water == 'r')]['water_name'].unique()
munis_joined = ', '.join(sorted(fd_pop_map.index))

muni_string = F"""**The municipalities in this report:**\n\n{munis_joined}
"""
md(muni_string)

The municipalities in this report:

Aarau, Allaman, Ascona, Beatenberg, Bellinzona, Bern, Biel/Bienne, Boudry, Bourg-en-Lavaux, Brienz (BE), Brissago, Brugg, Brügg, Burgdorf, Bönigen, Cheyres-Châbles, Cudrefin, Dietikon, Erlach, Estavayer, Freienbach, Gals, Gambarogno, Gebenstorf, Genève, Gland, Glarus Nord, Grandson, Hauterive (NE), Hünenberg, Kallnach, Köniz, Küsnacht (ZH), La Tour-de-Peilz, Lausanne, Lavey-Morcles, Le Landeron, Leuk, Ligerz, Locarno, Lugano, Luterbach, Lüscherz, Merenschwand, Minusio, Montreux, Neuchâtel, Nidau, Port, Préverenges, Quarten, Rapperswil-Jona, Richterswil, Riddes, Rubigen, Saint-Gingolph, Saint-Sulpice (VD), Salgesch, Schmerikon, Sion, Solothurn, Spiez, Stäfa, Thun, Tolochenaz, Unterengstringen, Unterseen, Versoix, Vevey, Vinelz, Walenstadt, Walperswil, Weesen, Weggis, Yverdon-les-Bains, Zug, Zürich

lakes_joined = ', '.join(sorted(lakes))

lake_string = F"""**The lakes in this report:**\n\n{lakes_joined}
"""
md(lake_string)

The lakes in this report:

Bielersee, Brienzersee, Lac Léman, Lago Maggiore, Lago di Lugano, Neuenburgersee, Quatre Cantons, Thunersee, Walensee, Zugersee, Zurichsee

The rivers in this report:

Aare, Aare|Nidau-Büren-Kanal, Cassarate, Dorfbach, Emme, Escherkanal, Jona, La Thièle, Limmat, Linthkanal, Maggia, Reuss, Rhône, Schüss, Seez, Sihl, Ticino

obj_string = '{:,}'.format(fd_n_obj)
surv_string = '{:,}'.format(fd_n_samps)
pop_string = '{:,}'.format(int(fd_effected_population[0]))

date_quantity_context = F"For the period between {start_date[:-3]} and {end_date[:-3]}, {obj_string } objects were removed and identified in the course of {surv_string} surveys."
geo_context = F"Those surveys were conducted at {fd_n_locs} different locations."
admin_context = F"There are {fd_n_munis} different municipalities represented in these results with a combined population of approximately {pop_string}."
md(F"{date_quantity_context} {geo_context } {admin_context}")

For the period between 2020-03 and 2021-05, 54,744 objects were removed and identified in the course of 386 surveys. Those surveys were conducted at 143 different locations. There are 77 different municipalities represented in these results with a combined population of approximately 1,735,991.

6.8.3. Results of Speramans ranked correlation test